notebook.community

Edit and run



In [1]:

    
# NOTE(review): star import -- presumably supplies pd, os, display and HTML used
# throughout this notebook; DATA_PATH is read on the next line, before
# `constants` is imported, so it must come from here. Confirm against setup.py.
from setup import *
import sys
# Make modules that live in DATA_PATH importable (needed for `constants` below).
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
# NOTE(review): star import -- presumably the source of the `url` pattern used
# by the cleaning cells further down; confirm against constants.py.
from constants import *

# Render matplotlib figures inside the notebook.
%matplotlib inline
# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Truncate displayed frames/series to 12 rows; allow up to 200 columns.
pd.set_option('display.max_rows', 12)
pd.set_option('display.max_columns', 200)









    



/home/hobs/.virtualenvs/AgileMachineLearning/lib/python3.5/site-packages/matplotlib/__init__.py:1350: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)



In [2]:

    
# Load the gzipped tweet dump; low_memory=False makes pandas read the file in
# one pass instead of chunk by chunk.
tweets_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
df = pd.read_csv(tweets_path, low_memory=False)



In [3]:

    
# Keep a single row per tweet id (the last occurrence) and only the columns
# needed for text processing.
df = df.drop_duplicates('id', keep='last')[['id', 'id_str', 'text']]
# Sanity check: number of rows where `id` and `id_str` disagree.
# (The element-wise comparison that used to precede this line was computed and
# then discarded, so it has been removed.)
(df.id != df.id_str).sum()









    Out[3]:





0



In [4]:

    
# `id_str` matched `id` on every row (mismatch count above was 0), so keep
# only the id and the raw tweet text.
kept_columns = ['id', 'text']
df = df[kept_columns]



In [5]:

    
# Peek at the raw tweet text (display is truncated by display.max_rows).
df['text']









    Out[5]:





0         #python never stop learning what you enjoy doi...
1         Watching Boa vs. Python — https://t.co/Pivpk02s2A
2         Monty Python - The silly walk https://t.co/C0J...
3         Senior Software Engineer Full Stack Python Dja...
4         Architect Django Solr Platform Engineer With P...
5                     peaceful rain? Python - inevitability
                                ...                        
183064    Las 3 mejores ides para Python Antes de empeza...
183065    Gagal tidur gegara habis vertical limit ada fi...
183066    Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
183067    RT @RealPython: List of Python API Wrappers &g...
183068    Watching Boa vs. Python — https://t.co/5THbrirfQO
183069    Чертова дюжина вакансий в IT и Digital /  / 1....
Name: text, dtype: object



In [6]:

    
# Naive tokenizer: split each tweet on runs of whitespace. URLs, punctuation
# and hashtags stay glued to their tokens at this stage.
raw_text = df['text']
df['tokens'] = raw_text.str.split()
df









    Out[6]:






  
    
      
      id
      text
      tokens
    
  
  
    
      0
      724276510626979840
      #python never stop learning what you enjoy doi...
      [#python, never, stop, learning, what, you, en...
    
    
      1
      724276498249572352
      Watching Boa vs. Python — https://t.co/Pivpk02s2A
      [Watching, Boa, vs., Python, —, https://t.co/P...
    
    
      2
      724276388325412866
      Monty Python - The silly walk https://t.co/C0J...
      [Monty, Python, -, The, silly, walk, https://t...
    
    
      3
      725078887005347840
      Senior Software Engineer Full Stack Python Dja...
      [Senior, Software, Engineer, Full, Stack, Pyth...
    
    
      4
      725078874338541572
      Architect Django Solr Platform Engineer With P...
      [Architect, Django, Solr, Platform, Engineer, ...
    
    
      5
      725078868802068481
      peaceful rain? Python - inevitability
      [peaceful, rain?, Python, -, inevitability]
    
    
      ...
      ...
      ...
      ...
    
    
      183064
      724275847591546880
      Las 3 mejores ides para Python Antes de empeza...
      [Las, 3, mejores, ides, para, Python, Antes, d...
    
    
      183065
      724275810777985026
      Gagal tidur gegara habis vertical limit ada fi...
      [Gagal, tidur, gegara, habis, vertical, limit,...
    
    
      183066
      724275650043875328
      Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
      [Go, boa, wkwk💪😄, ★, Boa, vs., Python, —, http...
    
    
      183067
      724275609858392066
      RT @RealPython: List of Python API Wrappers &g...
      [RT, @RealPython:, List, of, Python, API, Wrap...
    
    
      183068
      724275578879111169
      Watching Boa vs. Python — https://t.co/5THbrirfQO
      [Watching, Boa, vs., Python, —, https://t.co/5...
    
    
      183069
      724275568871673857
      Чертова дюжина вакансий в IT и Digital /  / 1....
      [Чертова, дюжина, вакансий, в, IT, и, Digital,...
    
  

183070 rows × 3 columns



In [7]:

    
# Strip URLs before splitting on whitespace.
# `url` comes from the star imports at the top of the notebook and is used as
# a regular expression (it removes many different links), so ask for regex
# matching explicitly: since pandas 2.0 `Series.str.replace` defaults to
# literal matching and rejects compiled patterns unless regex=True.
df['tokens'] = df.text.str.replace(url, '', regex=True).str.split()
df









    Out[7]:






  
    
      
      id
      text
      tokens
    
  
  
    
      0
      724276510626979840
      #python never stop learning what you enjoy doi...
      [#python, never, stop, learning, what, you, en...
    
    
      1
      724276498249572352
      Watching Boa vs. Python — https://t.co/Pivpk02s2A
      [Watching, Boa, vs., Python, —]
    
    
      2
      724276388325412866
      Monty Python - The silly walk https://t.co/C0J...
      [Monty, Python, -, The, silly, walk, via, @You...
    
    
      3
      725078887005347840
      Senior Software Engineer Full Stack Python Dja...
      [Senior, Software, Engineer, Full, Stack, Pyth...
    
    
      4
      725078874338541572
      Architect Django Solr Platform Engineer With P...
      [Architect, Django, Solr, Platform, Engineer, ...
    
    
      5
      725078868802068481
      peaceful rain? Python - inevitability
      [peaceful, rain?, Python, -, inevitability]
    
    
      ...
      ...
      ...
      ...
    
    
      183064
      724275847591546880
      Las 3 mejores ides para Python Antes de empeza...
      [Las, 3, mejores, ides, para, Python, Antes, d...
    
    
      183065
      724275810777985026
      Gagal tidur gegara habis vertical limit ada fi...
      [Gagal, tidur, gegara, habis, vertical, limit,...
    
    
      183066
      724275650043875328
      Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
      [Go, boa, wkwk💪😄, ★, Boa, vs., Python, —]
    
    
      183067
      724275609858392066
      RT @RealPython: List of Python API Wrappers &g...
      [RT, @RealPython:, List, of, Python, API, Wrap...
    
    
      183068
      724275578879111169
      Watching Boa vs. Python — https://t.co/5THbrirfQO
      [Watching, Boa, vs., Python, —]
    
    
      183069
      724275568871673857
      Чертова дюжина вакансий в IT и Digital /  / 1....
      [Чертова, дюжина, вакансий, в, IT, и, Digital,...
    
  

183070 rows × 3 columns



In [8]:

    
# Build a cleaned-up copy of the text:
#   1. replace every URL with a space (`url` is a regex from the star imports),
#   2. collapse every run of non-word characters into a single space.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text untouched.
# Step 2 already turns each whitespace run into one space (whitespace is
# matched by \W), so a separate r'\s+' pass afterwards would change nothing.
# NOTE: this also strips the leading '#' / '@' from hashtags and mentions.
df['txt'] = (
    df.text
    .str.replace(url, ' ', regex=True)
    .str.replace(r'\W+', ' ', regex=True)
)
df.txt









    Out[8]:





0          python never stop learning what you enjoy doing 
1                                   Watching Boa vs Python 
2                   Monty Python The silly walk via YouTube
3         Senior Software Engineer Full Stack Python Dja...
4         Architect Django Solr Platform Engineer With P...
5                        peaceful rain Python inevitability
                                ...                        
183064    Las 3 mejores ides para Python Antes de empeza...
183065    Gagal tidur gegara habis vertical limit ada fi...
183066                           Go boa wkwk Boa vs Python 
183067    RT RealPython List of Python API Wrappers gt g...
183068                              Watching Boa vs Python 
183069    Чертова дюжина вакансий в IT и Digital 1 Go ра...
Name: txt, dtype: object



In [9]:

    
# Drop digit runs, then squeeze the double spaces that removal leaves behind.
# regex=True is required on pandas >= 2.0, where the default is a literal
# (non-regex) replacement and these patterns would never match.
df['txt'] = (
    df.txt
    .str.replace(r'\d+', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)
# Re-tokenize from the cleaned text.
df['tokens'] = df.txt.str.split()
df









    Out[9]:






  
    
      
      id
      text
      tokens
      txt
    
  
  
    
      0
      724276510626979840
      #python never stop learning what you enjoy doi...
      [python, never, stop, learning, what, you, enj...
      python never stop learning what you enjoy doing
    
    
      1
      724276498249572352
      Watching Boa vs. Python — https://t.co/Pivpk02s2A
      [Watching, Boa, vs, Python]
      Watching Boa vs Python
    
    
      2
      724276388325412866
      Monty Python - The silly walk https://t.co/C0J...
      [Monty, Python, The, silly, walk, via, YouTube]
      Monty Python The silly walk via YouTube
    
    
      3
      725078887005347840
      Senior Software Engineer Full Stack Python Dja...
      [Senior, Software, Engineer, Full, Stack, Pyth...
      Senior Software Engineer Full Stack Python Dja...
    
    
      4
      725078874338541572
      Architect Django Solr Platform Engineer With P...
      [Architect, Django, Solr, Platform, Engineer, ...
      Architect Django Solr Platform Engineer With P...
    
    
      5
      725078868802068481
      peaceful rain? Python - inevitability
      [peaceful, rain, Python, inevitability]
      peaceful rain Python inevitability
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      183064
      724275847591546880
      Las 3 mejores ides para Python Antes de empeza...
      [Las, mejores, ides, para, Python, Antes, de, ...
      Las mejores ides para Python Antes de empezar ...
    
    
      183065
      724275810777985026
      Gagal tidur gegara habis vertical limit ada fi...
      [Gagal, tidur, gegara, habis, vertical, limit,...
      Gagal tidur gegara habis vertical limit ada fi...
    
    
      183066
      724275650043875328
      Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
      [Go, boa, wkwk, Boa, vs, Python]
      Go boa wkwk Boa vs Python
    
    
      183067
      724275609858392066
      RT @RealPython: List of Python API Wrappers &g...
      [RT, RealPython, List, of, Python, API, Wrappe...
      RT RealPython List of Python API Wrappers gt g...
    
    
      183068
      724275578879111169
      Watching Boa vs. Python — https://t.co/5THbrirfQO
      [Watching, Boa, vs, Python]
      Watching Boa vs Python
    
    
      183069
      724275568871673857
      Чертова дюжина вакансий в IT и Digital /  / 1....
      [Чертова, дюжина, вакансий, в, IT, и, Digital,...
      Чертова дюжина вакансий в IT и Digital Go разр...
    
  

183070 rows × 4 columns

Notice that we clobbered the hashtag #python: the leading '#' was stripped.
That's not good.
Can you fix it?
Anything else we might be messing up?
What other punctuation marks have special meaning in Tweets?



In [10]:

    
# improve on the "stopword" filters here
#
# :-) (ask me about a smiley lexicon)
# not-so-simple words? (ask me about a regex for compound words)
# Python variable names with underscores? (regex)



In [11]:

    
# Use the standard-library constant directly; `pd.io.common.csv` only worked
# because a private pandas module happened to import `csv`, which is not part
# of the public pandas API.
import csv

# Persist the cleaned frame as gzipped CSV, quoting every non-numeric field so
# that commas and quotes inside tweets do not break the file.
f = os.path.join(DATA_PATH, 'text.csv.gz')
df.to_csv(f, encoding='utf8', compression='gzip', quoting=csv.QUOTE_NONNUMERIC)



In [12]:

    
import gzip

# Round-trip check: decompress the file ourselves and hand the binary stream
# to pandas.
text_path = os.path.join(DATA_PATH, 'text.csv.gz')
with gzip.open(text_path, 'rb') as gz_stream:
    df = pd.read_csv(gz_stream)

Make sure you can read it back in!



In [13]:

    
# `DataFrame.from_csv` was deprecated and later removed from pandas; read_csv
# with index_col=0 is its replacement. from_csv additionally passed
# parse_dates=True, which only affects the index -- here the saved index is the
# plain integer row label, so date parsing is not needed.
# gzip compression is inferred from the '.gz' extension.
df = pd.read_csv(os.path.join(DATA_PATH, 'text.csv.gz'), index_col=0)
df









    Out[13]:






  
    
      
      id
      text
      tokens
      txt
    
  
  
    
      0
      724276510626979840
      #python never stop learning what you enjoy doi...
      ['python', 'never', 'stop', 'learning', 'what'...
      python never stop learning what you enjoy doing
    
    
      1
      724276498249572352
      Watching Boa vs. Python — https://t.co/Pivpk02s2A
      ['Watching', 'Boa', 'vs', 'Python']
      Watching Boa vs Python
    
    
      2
      724276388325412866
      Monty Python - The silly walk https://t.co/C0J...
      ['Monty', 'Python', 'The', 'silly', 'walk', 'v...
      Monty Python The silly walk via YouTube
    
    
      3
      725078887005347840
      Senior Software Engineer Full Stack Python Dja...
      ['Senior', 'Software', 'Engineer', 'Full', 'St...
      Senior Software Engineer Full Stack Python Dja...
    
    
      4
      725078874338541572
      Architect Django Solr Platform Engineer With P...
      ['Architect', 'Django', 'Solr', 'Platform', 'E...
      Architect Django Solr Platform Engineer With P...
    
    
      5
      725078868802068481
      peaceful rain? Python - inevitability
      ['peaceful', 'rain', 'Python', 'inevitability']
      peaceful rain Python inevitability
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      183064
      724275847591546880
      Las 3 mejores ides para Python Antes de empeza...
      ['Las', 'mejores', 'ides', 'para', 'Python', '...
      Las mejores ides para Python Antes de empezar ...
    
    
      183065
      724275810777985026
      Gagal tidur gegara habis vertical limit ada fi...
      ['Gagal', 'tidur', 'gegara', 'habis', 'vertica...
      Gagal tidur gegara habis vertical limit ada fi...
    
    
      183066
      724275650043875328
      Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...
      ['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python']
      Go boa wkwk Boa vs Python
    
    
      183067
      724275609858392066
      RT @RealPython: List of Python API Wrappers &g...
      ['RT', 'RealPython', 'List', 'of', 'Python', '...
      RT RealPython List of Python API Wrappers gt g...
    
    
      183068
      724275578879111169
      Watching Boa vs. Python — https://t.co/5THbrirfQO
      ['Watching', 'Boa', 'vs', 'Python']
      Watching Boa vs Python
    
    
      183069
      724275568871673857
      Чертова дюжина вакансий в IT и Digital /  / 1....
      ['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '...
      Чертова дюжина вакансий в IT и Digital Go разр...
    
  

183070 rows × 4 columns



In [ ]:

	id	text	tokens
0	724276510626979840	#python never stop learning what you enjoy doi...	[#python, never, stop, learning, what, you, en...
1	724276498249572352	Watching Boa vs. Python — https://t.co/Pivpk02s2A	[Watching, Boa, vs., Python, —, https://t.co/P...
2	724276388325412866	Monty Python - The silly walk https://t.co/C0J...	[Monty, Python, -, The, silly, walk, https://t...
3	725078887005347840	Senior Software Engineer Full Stack Python Dja...	[Senior, Software, Engineer, Full, Stack, Pyth...
4	725078874338541572	Architect Django Solr Platform Engineer With P...	[Architect, Django, Solr, Platform, Engineer, ...
5	725078868802068481	peaceful rain? Python - inevitability	[peaceful, rain?, Python, -, inevitability]
...	...	...	...
183064	724275847591546880	Las 3 mejores ides para Python Antes de empeza...	[Las, 3, mejores, ides, para, Python, Antes, d...
183065	724275810777985026	Gagal tidur gegara habis vertical limit ada fi...	[Gagal, tidur, gegara, habis, vertical, limit,...
183066	724275650043875328	Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...	[Go, boa, wkwk💪😄, ★, Boa, vs., Python, —, http...
183067	724275609858392066	RT @RealPython: List of Python API Wrappers &g...	[RT, @RealPython:, List, of, Python, API, Wrap...
183068	724275578879111169	Watching Boa vs. Python — https://t.co/5THbrirfQO	[Watching, Boa, vs., Python, —, https://t.co/5...
183069	724275568871673857	Чертова дюжина вакансий в IT и Digital / / 1....	[Чертова, дюжина, вакансий, в, IT, и, Digital,...

	id	text	tokens	txt
0	724276510626979840	#python never stop learning what you enjoy doi...	[python, never, stop, learning, what, you, enj...	python never stop learning what you enjoy doing
1	724276498249572352	Watching Boa vs. Python — https://t.co/Pivpk02s2A	[Watching, Boa, vs, Python]	Watching Boa vs Python
2	724276388325412866	Monty Python - The silly walk https://t.co/C0J...	[Monty, Python, The, silly, walk, via, YouTube]	Monty Python The silly walk via YouTube
3	725078887005347840	Senior Software Engineer Full Stack Python Dja...	[Senior, Software, Engineer, Full, Stack, Pyth...	Senior Software Engineer Full Stack Python Dja...
4	725078874338541572	Architect Django Solr Platform Engineer With P...	[Architect, Django, Solr, Platform, Engineer, ...	Architect Django Solr Platform Engineer With P...
5	725078868802068481	peaceful rain? Python - inevitability	[peaceful, rain, Python, inevitability]	peaceful rain Python inevitability
...	...	...	...	...
183064	724275847591546880	Las 3 mejores ides para Python Antes de empeza...	[Las, mejores, ides, para, Python, Antes, de, ...	Las mejores ides para Python Antes de empezar ...
183065	724275810777985026	Gagal tidur gegara habis vertical limit ada fi...	[Gagal, tidur, gegara, habis, vertical, limit,...	Gagal tidur gegara habis vertical limit ada fi...
183066	724275650043875328	Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...	[Go, boa, wkwk, Boa, vs, Python]	Go boa wkwk Boa vs Python
183067	724275609858392066	RT @RealPython: List of Python API Wrappers &g...	[RT, RealPython, List, of, Python, API, Wrappe...	RT RealPython List of Python API Wrappers gt g...
183068	724275578879111169	Watching Boa vs. Python — https://t.co/5THbrirfQO	[Watching, Boa, vs, Python]	Watching Boa vs Python
183069	724275568871673857	Чертова дюжина вакансий в IT и Digital / / 1....	[Чертова, дюжина, вакансий, в, IT, и, Digital,...	Чертова дюжина вакансий в IT и Digital Go разр...

	id	text	tokens	txt
0	724276510626979840	#python never stop learning what you enjoy doi...	['python', 'never', 'stop', 'learning', 'what'...	python never stop learning what you enjoy doing
1	724276498249572352	Watching Boa vs. Python — https://t.co/Pivpk02s2A	['Watching', 'Boa', 'vs', 'Python']	Watching Boa vs Python
2	724276388325412866	Monty Python - The silly walk https://t.co/C0J...	['Monty', 'Python', 'The', 'silly', 'walk', 'v...	Monty Python The silly walk via YouTube
3	725078887005347840	Senior Software Engineer Full Stack Python Dja...	['Senior', 'Software', 'Engineer', 'Full', 'St...	Senior Software Engineer Full Stack Python Dja...
4	725078874338541572	Architect Django Solr Platform Engineer With P...	['Architect', 'Django', 'Solr', 'Platform', 'E...	Architect Django Solr Platform Engineer With P...
5	725078868802068481	peaceful rain? Python - inevitability	['peaceful', 'rain', 'Python', 'inevitability']	peaceful rain Python inevitability
...	...	...	...	...
183064	724275847591546880	Las 3 mejores ides para Python Antes de empeza...	['Las', 'mejores', 'ides', 'para', 'Python', '...	Las mejores ides para Python Antes de empezar ...
183065	724275810777985026	Gagal tidur gegara habis vertical limit ada fi...	['Gagal', 'tidur', 'gegara', 'habis', 'vertica...	Gagal tidur gegara habis vertical limit ada fi...
183066	724275650043875328	Go boa wkwk💪😄 ★ Boa vs. Python — https://t.co/...	['Go', 'boa', 'wkwk', 'Boa', 'vs', 'Python']	Go boa wkwk Boa vs Python
183067	724275609858392066	RT @RealPython: List of Python API Wrappers &g...	['RT', 'RealPython', 'List', 'of', 'Python', '...	RT RealPython List of Python API Wrappers gt g...
183068	724275578879111169	Watching Boa vs. Python — https://t.co/5THbrirfQO	['Watching', 'Boa', 'vs', 'Python']	Watching Boa vs Python
183069	724275568871673857	Чертова дюжина вакансий в IT и Digital / / 1....	['Чертова', 'дюжина', 'вакансий', 'в', 'IT', '...	Чертова дюжина вакансий в IT и Digital Go разр...